This module builds on code contained in Coronavirus_Statistics_USAF_v006.Rmd. This file includes the latest code for analyzing data from USA Facts. USA Facts maintains data on cases and deaths by county for coronavirus in the US. Downloaded data are unique by county with date as a column and a separate file for each of cases, deaths, and population.
The intent of this code is to source updated functions that allow for readRunUSAFacts() to be run to obtain, read, process, and analyze data from USA Facts.
The tidyverse library is loaded, and the functions used for CDC daily processing are sourced. Additionally, specific functions for USA Facts are also sourced:
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v002.R")
source("./Coronavirus_USAF_Functions_v001.R")
Further, the mapping file specific to USA Facts is sourced:
source("./Coronavirus_USAF_Default_Mappings_v002.R")
Updated functions for diagnoseClusters(), createDetailedSummaries(), createSummary(), and helperSummaryMap() are included in Coronavirus_USAF_Functions_v001.R. These functions should be checked for consistency with state-level data with just a single copy kept later.
The latest county-level burden data are downloaded:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20220913.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20220913.csv"
)
compareList <- list("usafCase"=readFromRDS("cty_newdata_20220807")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20220807")$dfRaw$usafDeath
)
# Use existing clusters
cty_newdata_20220913 <- readRunUSAFacts(maxDate="2022-09-11",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20220913_chk_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210813")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20220913.csv
## Rows: 3193 Columns: 962
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County Name, State, StateFIPS
## dbl (959): countyFIPS, 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 33
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20220913_chk_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 2 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20220913_chk_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 128 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20220913_chk_v005.log
## Rows: 3193 Columns: 962
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County Name, State, StateFIPS
## dbl (959): countyFIPS, 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 33
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20220913_chk_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 3 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20220913_chk_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 108 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20220913_chk_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 × 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 3.45e+10 93305986 3058894
## 2 after 3.43e+10 91158859 3010036
## 3 pctchg 7.16e- 3 0.0230 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 × 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 5.12e+8 1034939 3058894
## 2 after 4.95e+8 964043 3010036
## 3 pctchg 3.23e-2 0.0685 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20220913$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Save the refreshed file
saveToRDS(cty_newdata_20220913, ovrWriteError=FALSE)
Vaccines data are also updated, though the process needs to integrate previous data:
cty_vaxdata_20220914 <- processCountyVaccines(loc="./RInputFiles/Coronavirus/county_vaccine_20220914.csv",
ctyList=cty_newdata_20220913,
minDateCD=c("2022-06-09", "2022-06-09"),
maxDateCD="2022-09-01"
)
## Rows: 78961 Columns: 72
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Date, FIPS, Recip_County, Recip_State, SVI_CTGY, Metro_status
## dbl (66): MMWR_week, Completeness_pct, Administered_Dose1_Recip, Administere...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##
## Records from other than 50 states and DC:
## # A tibble: 9 × 2
## state n
## <chr> <int>
## 1 AS 24
## 2 FM 25
## 3 GU 48
## 4 MH 24
## 5 MP 24
## 6 PR 1901
## 7 PW 24
## 8 VI 96
## 9 <NA> 17
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).
##
## Count of NA records by column
## state FIPS popgte65_minpop popgte65_maxpop popgte65_nnA
## 0 0 0 0 0
## n
## 0
##
## Records where minimum and maximum population differ# A tibble: 0 × 5
## # … with 5 variables: state <chr>, FIPS <chr>, age <chr>, minpop <dbl>,
## # maxpop <dbl>
## # ℹ Use `colnames()` to see all variable names
##
##
##
## Will run with parameters:
## burdenVar: cpm dpm
## vaxVar: vxcpoppct vxcpoppct
## minDateCD: 2022-06-09 2022-06-09
## maxDateCD: 2022-09-01 2022-09-01
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Removed 16 rows containing missing values (geom_point).
##
## Call:
## lm(formula = get(burdenVar) ~ vaxMetric, data = dfReg, weights = pop)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -289154037 -1505413 -90865 1548407 62031339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20807.226 2187.230 9.513 <2e-16 ***
## vaxMetric 8.061 33.809 0.238 0.812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7367000 on 3124 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 1.82e-05, Adjusted R-squared: -0.0003019
## F-statistic: 0.05684 on 1 and 3124 DF, p-value: 0.8116
##
##
## Call:
## lm(formula = get(burdenVar) ~ vaxMetric * type + 0 - vaxMetric,
## data = dfReg, weights = pop)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -288389946 -1463009 -64586 1597997 65055859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## type<25k 12176.47 7942.37 1.533 0.125352
## type>500k 29162.62 4682.01 6.229 5.34e-10 ***
## type100k-500k 17769.84 4666.92 3.808 0.000143 ***
## type25k-100k 17521.42 5263.39 3.329 0.000882 ***
## vaxMetric:type<25k 175.75 159.92 1.099 0.271861
## vaxMetric:type>500k -110.85 66.36 -1.670 0.094933 .
## vaxMetric:type100k-500k 60.38 74.96 0.805 0.420630
## vaxMetric:type25k-100k 66.14 99.33 0.666 0.505574
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7368000 on 3118 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.4677, Adjusted R-squared: 0.4663
## F-statistic: 342.4 on 8 and 3118 DF, p-value: < 2.2e-16
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Removed 16 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Removed 16 rows containing missing values (geom_point).
##
## Call:
## lm(formula = get(burdenVar) ~ vaxMetric, data = dfReg, weights = pop)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3529509 -12224 -2495 14425 697276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 298.9805 29.5946 10.103 <2e-16 ***
## vaxMetric -4.0106 0.4575 -8.767 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99680 on 3124 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.02401, Adjusted R-squared: 0.0237
## F-statistic: 76.86 on 1 and 3124 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = get(burdenVar) ~ vaxMetric * type + 0 - vaxMetric,
## data = dfReg, weights = pop)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3480323 -13174 -5106 10332 707402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## type<25k 130.2314 107.0422 1.217 0.22384
## type>500k 383.7067 63.1011 6.081 1.34e-09 ***
## type100k-500k 99.6779 62.8978 1.585 0.11312
## type25k-100k 207.0272 70.9366 2.918 0.00354 **
## vaxMetric:type<25k -0.3702 2.1553 -0.172 0.86365
## vaxMetric:type>500k -5.4661 0.8943 -6.112 1.11e-09 ***
## vaxMetric:type100k-500k -0.5040 1.0103 -0.499 0.61791
## vaxMetric:type25k-100k -1.8241 1.3388 -1.362 0.17314
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99300 on 3118 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.05224, Adjusted R-squared: 0.04981
## F-statistic: 21.48 on 8 and 3118 DF, p-value: < 2.2e-16
# Save the refreshed file
saveToRDS(cty_vaxdata_20220914, ovrWriteError=FALSE)
County-level data are post-processed:
cty_postdata_20220913 <- postProcessCountyData(lstCtyBurden=cty_newdata_20220913$dfPerCapita,
lstCtyVax=cty_vaxdata_20220914$vaxFix,
lstState=readFromRDS("cdc_daily_220902")$dfPerCapita
)
##
## Parameter maxDate is: 2022-09-01
# Save the refreshed file
saveToRDS(cty_postdata_20220913, ovrWriteError=FALSE)
Additional post-processing steps are run:
# Step 1a: Burden comparisons for aggregated states
additionalCountyPostProcess(cty_postdata_20220913, p1CompareStates=c(state.abb, "DC"), p1AggData=TRUE)
## Warning: Removed 6 row(s) containing missing values (geom_path).
# Step 1: Burden aggregation for key states
# Step 2: vaccine comparisons
# Step 3: Scoring updates (and errors)
# Step 4: New rolling data (28-day default with ceilings 50000 CPM, 500 DPM)
additionalCountyPostProcess(cty_postdata_20220913,
p1CompareStates=c("GA", "FL", "NE"),
p2VaxStates=c("MA", "HI", "TX", "VA", "VT", "GA", "CO", "SD"),
p3VaxTimes=sort(c("2022-01-01", "2022-08-31")),
p4DF=cty_newdata_20220913$dfPerCapita
)
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 1 row(s) containing missing values (geom_path).
Additional plots are also updated:
# Creating the vaccine and burden data
tmpVaxBurden <- createVaxBurdenData(lstVax=cty_vaxdata_20220914, lstBurden=cty_newdata_20220913)
# Nationwide aggregation, excluding problem states
problemStates <- c("VA", "TX", "SD", "HI", "GA", "CO", "GA", "FL", "NE", "VA")
useStates <- state.abb
tmpCountyList <- tmpVaxBurden$ctyPop %>%
filter(state %in% useStates, !(state %in% problemStates)) %>%
left_join(filter(tmpVaxBurden$dfVaxBurden, name=="vxcpoppct", date==max(date)), by="countyFIPS") %>%
mutate(vaxPct=percent_rank(value),
vaxBucket=case_when(vaxPct <= .25 ~ "3. Low", vaxPct >= .75 ~ "1. High", TRUE ~ "2. Medium")
) %>%
split(f=.$vaxBucket)
# Plot of absolute burden
plotVaxBurdenData(tmpVaxBurden,
ctyPlot=lapply(tmpCountyList, FUN=function(x) x %>% rename(bucket=vaxBucket)),
plotTitle="Counties in states with continuous county data"
)
# Plots of burden relative to May 2021
plotVaxBurdenData(tmpVaxBurden,
ctyPlot=lapply(tmpCountyList, FUN=function(x) x %>% rename(bucket=vaxBucket)),
plotTitle="Counties in states with continuous county data",
scaleToDate="2021-05-01"
)
Multiple data anomalies need to be addressed - vaccines data covering only a short time period, counties significantly revising burden downwards, etc.
Counties with significant negative burden are investigated:
keyRatio <- cty_newdata_20220913$dfPerCapita %>%
select(countyFIPS, state, date, cases, deaths) %>%
group_by(countyFIPS, state) %>%
summarize(keyRatDeath=sum(ifelse(date==max(date), deaths, 0)/max(deaths)),
keyRatCases=sum(ifelse(date==max(date), cases, 0)/max(cases)),
.groups="drop"
)
keyRatio %>%
mutate(across(where(is.numeric), .fns=function(x) round(x, 3))) %>%
count(keyRatCases, keyRatDeath) %>%
ggplot(aes(x=keyRatCases, y=keyRatDeath)) +
geom_point(aes(size=n)) +
lims(x=c(0, 1), y=c(0, 1)) +
labs(x="Latest cases vs. maximum cases",
y="Latest deaths vs. maximum deaths",
title="County-level restatement"
)
## Warning: Removed 6 rows containing missing values (geom_point).
keyRatio %>%
mutate(across(where(is.numeric), .fns=function(x) round(x, 3))) %>%
count(keyRatCases, keyRatDeath) %>%
filter(keyRatCases < 1 | keyRatDeath < 1) %>%
ggplot(aes(x=keyRatCases, y=keyRatDeath)) +
geom_point(aes(size=n)) +
lims(x=c(0, 1), y=c(0, 1)) +
labs(x="Latest cases vs. maximum cases",
y="Latest deaths vs. maximum deaths",
title="County-level restatement",
subtitle="Only counties with at least one ratio .999 or lower included"
)
## Warning: Removed 4 rows containing missing values (geom_point).
Restatement of deaths is much more common than restatement of cases. Counties with declines are further explored:
decStatus <- keyRatio %>%
mutate(rd=case_when(is.na(keyRatDeath) ~ -1,
keyRatDeath==1 ~ 1,
keyRatDeath>=.95 ~ .95,
keyRatDeath>=.9 ~ .9,
keyRatDeath>=.75 ~ .75,
keyRatDeath>=.5 ~ .5,
TRUE ~ 0
)
)
decStatus %>%
count(rd)
## # A tibble: 7 × 2
## rd n
## <dbl> <int>
## 1 -1 20
## 2 0 17
## 3 0.5 23
## 4 0.75 61
## 5 0.9 34
## 6 0.95 118
## 7 1 2869
decStatus %>%
count(rd, state) %>%
filter(rd<.95, rd>=0, n>1) %>%
arrange(rd, -n)
## # A tibble: 18 × 3
## rd state n
## <dbl> <chr> <int>
## 1 0 NE 4
## 2 0 AK 3
## 3 0 TX 3
## 4 0 MA 2
## 5 0.5 IL 10
## 6 0.5 NE 9
## 7 0.5 AK 2
## 8 0.75 IL 37
## 9 0.75 MA 11
## 10 0.75 NE 4
## 11 0.75 KS 3
## 12 0.75 VA 2
## 13 0.9 IL 17
## 14 0.9 NE 4
## 15 0.9 CA 2
## 16 0.9 CO 2
## 17 0.9 FL 2
## 18 0.9 VA 2
cty_newdata_20220913$dfPerCapita %>%
left_join(select(decStatus, countyFIPS, state, rd), by=c("countyFIPS", "state")) %>%
group_by(rd, date) %>%
summarize(cases=sum(cases), deaths=sum(deaths), n=n(), .groups="drop") %>%
mutate(labFacet=paste0(rd, " (n=", n, ")")) %>%
ggplot(aes(x=date, y=deaths)) +
geom_line() +
facet_wrap(~labFacet, scales="free_y") +
labs(x=NULL,
y="Reported deaths",
title="Reported deaths, facetted by change from maximum"
)
Illinois and Nebraska appear to be main drivers of reported declines (discontinuities) in deaths. Reporting is potentially lagged, as much of the issue appears to be recent.
Restatement of cases is also explored:
decStatus <- keyRatio %>%
mutate(rd=case_when(is.na(keyRatCases) ~ -1,
keyRatCases==1 ~ 1,
keyRatCases>=.99 ~ .99,
keyRatCases>=.95 ~ .95,
keyRatCases>=.5 ~ .5,
TRUE ~ 0
)
)
decStatus %>%
count(rd)
## # A tibble: 6 × 2
## rd n
## <dbl> <int>
## 1 -1 2
## 2 0 4
## 3 0.5 10
## 4 0.95 16
## 5 0.99 69
## 6 1 3041
decStatus %>%
count(rd, state) %>%
filter(rd<.99, rd>=0, n>1) %>%
arrange(rd, -n)
## # A tibble: 5 × 3
## rd state n
## <dbl> <chr> <int>
## 1 0 TX 3
## 2 0.5 NV 4
## 3 0.5 UT 2
## 4 0.95 NE 6
## 5 0.95 VA 6
cty_newdata_20220913$dfPerCapita %>%
left_join(select(decStatus, countyFIPS, state, rd), by=c("countyFIPS", "state")) %>%
group_by(rd, date) %>%
summarize(cases=sum(cases), deaths=sum(deaths), n=n(), .groups="drop") %>%
mutate(labFacet=paste0(rd, " (n=", n, ")")) %>%
ggplot(aes(x=date, y=cases)) +
geom_line() +
facet_wrap(~labFacet, scales="free_y") +
labs(x=NULL,
y="Reported cases",
title="Reported cases, facetted by change from maximum"
)
In aggregate, the cases look OK, with the exception of a few counties that may have incomplete reporting in the most recent time period. Specific county declines are further explored:
decStatus <- keyRatio %>%
mutate(rd=case_when(is.na(keyRatDeath) ~ -1,
keyRatDeath==1 ~ 1,
keyRatDeath>=.95 ~ .95,
keyRatDeath>=.9 ~ .9,
keyRatDeath>=.75 ~ .75,
keyRatDeath>=.5 ~ .5,
TRUE ~ 0
)
)
decStatus %>%
count(rd)
## # A tibble: 7 × 2
## rd n
## <dbl> <int>
## 1 -1 20
## 2 0 17
## 3 0.5 23
## 4 0.75 61
## 5 0.9 34
## 6 0.95 118
## 7 1 2869
decStatus %>%
count(rd, state) %>%
filter(rd<.95, rd>=0, n>1) %>%
arrange(rd, -n)
## # A tibble: 18 × 3
## rd state n
## <dbl> <chr> <int>
## 1 0 NE 4
## 2 0 AK 3
## 3 0 TX 3
## 4 0 MA 2
## 5 0.5 IL 10
## 6 0.5 NE 9
## 7 0.5 AK 2
## 8 0.75 IL 37
## 9 0.75 MA 11
## 10 0.75 NE 4
## 11 0.75 KS 3
## 12 0.75 VA 2
## 13 0.9 IL 17
## 14 0.9 NE 4
## 15 0.9 CA 2
## 16 0.9 CO 2
## 17 0.9 FL 2
## 18 0.9 VA 2
cty_newdata_20220913$dfPerCapita %>%
left_join(select(decStatus, countyFIPS, state, rd), by=c("countyFIPS", "state")) %>%
filter(rd > 0, rd < 0.9) %>%
ggplot(aes(x=date, y=deaths)) +
geom_line(aes(group=countyFIPS, color=state)) +
facet_wrap(~rd, scales="free_y") +
labs(x=NULL,
y="Reported deaths by county",
title="Reported deaths, facetted by change from maximum"
)
There are two separate issues - some counties appear to have incomplete data in the latest time period, while other counties appear to have significant negative restatement of data earlier in 2022
The ratio process is updated:
keyRatioDate <- cty_newdata_20220913$dfPerCapita %>%
select(countyFIPS, state, date, cases, deaths) %>%
pivot_longer(-c(countyFIPS, state, date)) %>%
arrange(countyFIPS, state, name, date) %>%
group_by(countyFIPS, state, name) %>%
mutate(ratMax=value/max(value, na.rm=TRUE), cumMax=value/cummax(value)) %>%
ungroup()
keyRatioDate
## # A tibble: 6,020,072 × 7
## countyFIPS state date name value ratMax cumMax
## <chr> <chr> <date> <chr> <dbl> <dbl> <dbl>
## 1 01001 AL 2020-01-22 cases 0 0 NaN
## 2 01001 AL 2020-01-23 cases 0 0 NaN
## 3 01001 AL 2020-01-24 cases 0 0 NaN
## 4 01001 AL 2020-01-25 cases 0 0 NaN
## 5 01001 AL 2020-01-26 cases 0 0 NaN
## 6 01001 AL 2020-01-27 cases 0 0 NaN
## 7 01001 AL 2020-01-28 cases 0 0 NaN
## 8 01001 AL 2020-01-29 cases 0 0 NaN
## 9 01001 AL 2020-01-30 cases 0 0 NaN
## 10 01001 AL 2020-01-31 cases 0 0 NaN
## # … with 6,020,062 more rows
## # ℹ Use `print(n = ...)` to see more rows
dfMinMax <- keyRatioDate %>%
filter(!is.na(cumMax)) %>%
group_by(countyFIPS, name) %>%
summarize(minMax=min(cumMax), .groups="drop")
dfMinMax %>%
ggplot(aes(x=minMax)) +
geom_density(aes(group=name, color=name)) +
labs(title="Ratio of burden vs. cumulative maximum of burden (expected to be 1 for ascending sequence)",
subtitle="Lowest value per county and metric plotted",
x="Lowest value of daily burden vs. cumulative maximum of burden",
y="Density"
) +
scale_color_discrete("Metric")
dfMinMax %>%
filter(minMax == 0, name=="deaths") %>%
select(countyFIPS) %>%
left_join(cty_newdata_20220913$dfPerCapita, by="countyFIPS") %>%
ggplot(aes(x=date, y=deaths)) +
geom_line(aes(group=countyFIPS)) +
facet_wrap(~countyFIPS, scales="free_y") +
labs(title="Reported deaths by counties with zero following non-zero", x=NULL, y="Reported deaths")
While some of the declines are anomalous, others appear to be curves that are either very low volume or smooth on a rolling-7 basis
The process is repeated to examine issues by state:
keyRatioDateState <- cty_newdata_20220913$dfPerCapita %>%
group_by(state, date) %>%
summarize(across(c(cases, deaths), .fns=function(x) sum(x, na.rm=TRUE)), .groups="drop") %>%
pivot_longer(-c(state, date)) %>%
arrange(state, name, date) %>%
group_by(state, name) %>%
mutate(ratMax=value/max(value, na.rm=TRUE), cumMax=ifelse(cummax(value)==0, 1, value/cummax(value))) %>%
ungroup()
keyRatioDateState
## # A tibble: 97,716 × 6
## state date name value ratMax cumMax
## <chr> <date> <chr> <dbl> <dbl> <dbl>
## 1 AK 2020-01-22 cases 0 0 1
## 2 AK 2020-01-23 cases 0 0 1
## 3 AK 2020-01-24 cases 0 0 1
## 4 AK 2020-01-25 cases 0 0 1
## 5 AK 2020-01-26 cases 0 0 1
## 6 AK 2020-01-27 cases 0 0 1
## 7 AK 2020-01-28 cases 0 0 1
## 8 AK 2020-01-29 cases 0 0 1
## 9 AK 2020-01-30 cases 0 0 1
## 10 AK 2020-01-31 cases 0 0 1
## # … with 97,706 more rows
## # ℹ Use `print(n = ...)` to see more rows
dfMinMaxState <- keyRatioDateState %>%
filter(!is.na(cumMax)) %>%
group_by(state, name) %>%
summarize(minMax=min(cumMax), .groups="drop")
dfMinMaxState %>%
ggplot(aes(x=minMax)) +
geom_density(aes(group=name, color=name)) +
labs(title="Ratio of burden vs. cumulative maximum of burden (expected to be 1 for ascending sequence)",
subtitle="Lowest value per state and metric plotted",
x="Lowest value of daily burden vs. cumulative maximum of burden",
y="Density"
) +
scale_color_discrete("Metric")
dfMinMaxState %>%
filter(minMax < 0.95, name=="deaths") %>%
select(state) %>%
left_join(cty_newdata_20220913$dfPerCapita, by="state") %>%
group_by(state, date) %>%
summarize(across(c(cases, deaths), .fns=function(x) sum(x, na.rm=TRUE)), .groups="drop") %>%
ggplot(aes(x=date, y=deaths)) +
geom_line(aes(group=state)) +
facet_wrap(~state, scales="free_y") +
labs(title="Reported deaths by states with meaningfully non-ascending trend", x=NULL, y="Reported deaths")
Significant restatements appear to be in MA, while missing recent data appears to be in IL. It is unclear if MO and NE are still reporting county-level deaths. The data is potentially less complete and accurate than in previous iterations
The definition of a decline is modified to be min(cur-cummax)/max, so that declines of 1 or 2 early in the data are not flagged as major percentage changes:
keyRatioDateState_v2 <- cty_newdata_20220913$dfPerCapita %>%
group_by(state, date) %>%
summarize(across(c(cases, deaths), .fns=function(x) sum(x, na.rm=TRUE)), .groups="drop") %>%
pivot_longer(-c(state, date)) %>%
arrange(state, name, date) %>%
group_by(state, name) %>%
mutate(ratMax=value/max(value, na.rm=TRUE),
cumMax=ifelse(cummax(value)==0, 1, (value-cummax(value))/max(value, na.rm=TRUE))
) %>%
ungroup()
keyRatioDateState_v2
## # A tibble: 97,716 × 6
## state date name value ratMax cumMax
## <chr> <date> <chr> <dbl> <dbl> <dbl>
## 1 AK 2020-01-22 cases 0 0 1
## 2 AK 2020-01-23 cases 0 0 1
## 3 AK 2020-01-24 cases 0 0 1
## 4 AK 2020-01-25 cases 0 0 1
## 5 AK 2020-01-26 cases 0 0 1
## 6 AK 2020-01-27 cases 0 0 1
## 7 AK 2020-01-28 cases 0 0 1
## 8 AK 2020-01-29 cases 0 0 1
## 9 AK 2020-01-30 cases 0 0 1
## 10 AK 2020-01-31 cases 0 0 1
## # … with 97,706 more rows
## # ℹ Use `print(n = ...)` to see more rows
dfMinMaxState_v2 <- keyRatioDateState_v2 %>%
filter(!is.na(cumMax)) %>%
group_by(state, name) %>%
summarize(minMax=min(cumMax), .groups="drop")
dfMinMaxState_v2 %>%
ggplot(aes(x=fct_reorder(state, -minMax, min), y=1+minMax)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(1+minMax, 2)), hjust=0) +
labs(title="Ratio of burden vs. cumulative maximum of burden (expected to be 1 for ascending sequence)",
subtitle="Lowest value of 1 + (value - cummax(value)) / max(value)",
y="Lowest value",
x=NULL
) +
coord_flip() +
facet_wrap(~name)
dfMinMaxState_v2 %>%
filter(state %in% c("IL", "TX", "MA", "NE")) %>%
select(state) %>%
left_join(cty_newdata_20220913$dfPerCapita, by="state") %>%
group_by(state, date) %>%
summarize(across(c(cases, deaths), .fns=function(x) sum(x, na.rm=TRUE)), .groups="drop") %>%
ggplot(aes(x=date, y=deaths)) +
geom_line(aes(group=state)) +
facet_wrap(~state, scales="free_y") +
labs(title="Reported deaths by states with meaningfully non-ascending trend", x=NULL, y="Reported deaths")
dfMinMaxState_v2 %>%
filter(state %in% c("IL", "WY")) %>%
select(state) %>%
left_join(cty_newdata_20220913$dfPerCapita, by="state") %>%
group_by(state, date) %>%
summarize(across(c(cases, deaths), .fns=function(x) sum(x, na.rm=TRUE)), .groups="drop") %>%
ggplot(aes(x=date, y=cases)) +
geom_line(aes(group=state)) +
facet_wrap(~state, scales="free_y") +
labs(title="Reported cases by states with meaningfully non-ascending trend", x=NULL, y="Reported cases")
The updated methodology better flags states with meaningful restatement problems
The process is converted to functional form:
# Function to calculate distance from global maxima and previous maxima (including self)
findDeltaFromMax <- function(df, groupBy=c(), timeVar="date", numVars=NULL) {
# FUNCTION ARGUMENTS:
# df: a data frame
# groupBy: levels to which the final data should be aggregated
# timeVar: time variable to which data should be aggregated
# numVars: numeric variables to be summarized (NULL means all numeric variables)
# Find numVars if not provided
if(is.null(numVars)) numVars <- df %>% select(where(is.numeric)) %>% names %>% setdiff(groupBy)
df %>%
group_by_at(all_of(c(groupBy, timeVar))) %>%
summarize(across(all_of(numVars), .fns=function(x) sum(x, na.rm=TRUE)), .groups="drop") %>%
pivot_longer(all_of(numVars)) %>%
arrange(across(all_of(c(groupBy, "name", timeVar)))) %>%
group_by_at(all_of(c(groupBy, "name"))) %>%
mutate(ratMax=value/max(value, na.rm=TRUE),
cumMax=ifelse(cummax(value)==0, 1, value/cummax(value)),
delMax=ifelse(cummax(value)==0, 1, (value-cummax(value))/max(value, na.rm=TRUE))
) %>%
ungroup()
}
# Check for states
dfTest <- findDeltaFromMax(cty_newdata_20220913$dfPerCapita, groupBy="state", numVar=c("cases", "deaths"))
identical(dfTest %>% select(-delMax), keyRatioDateState)
## [1] TRUE
identical(dfTest %>% select(-cumMax) %>% rename(cumMax=delMax), keyRatioDateState_v2)
## [1] TRUE
# Check for counties - old approach output NaN rather than 1 when cummax(value)=0
dfTest_v2 <- findDeltaFromMax(cty_newdata_20220913$dfPerCapita,
groupBy=c("countyFIPS", "state"),
numVar=c("cases", "deaths")
)
identical(dfTest_v2 %>% select(-delMax, -cumMax), keyRatioDate %>% select(-cumMax))
## [1] TRUE
all.equal(dfTest_v2 %>% pull(cumMax),
keyRatioDate %>% select(cumMax) %>% mutate(cumMax=ifelse(is.na(cumMax), 1, cumMax)) %>% pull(cumMax)
)
## [1] TRUE
Functions are also written for plot creation:
plotDeltaFromMax <- function(df,
dfCtyData=NULL,
plotStateRatio=FALSE,
plotDeathStates=c(),
plotCaseStates=c()
) {
# FUNCTION ARGUMENTS:
# df: data frame or tibble containing ratios by entity and date
# dfCtyData: county-level per-capita data (not needed if only plotStateRatio is run)
# plotStateRatio: should the state ratio plot be created?
# plotDeathStates: vector of states to plot on the deaths metric (c() means do not plot)
# plotCaseStates: vector of states to plot on the cases metric (c() means do not plot)
# Plot 1: state ratios
if(isTRUE(plotStateRatio)) {
p1 <- df %>%
filter(!is.na(delMax)) %>%
group_by(state, name) %>%
summarize(minMax=min(delMax), .groups="drop") %>%
ggplot(aes(x=fct_reorder(state, -minMax, min), y=1+minMax)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(1+minMax, 2)), hjust=0) +
labs(title="Ratio of burden vs. cumulative maximum of burden (expected to be 1 for ascending sequence)",
subtitle="Lowest value of 1 + (value - cummax(value)) / max(value)",
y="Lowest value",
x=NULL
) +
coord_flip() +
facet_wrap(~name)
print(p1)
}
# Plots 2 and 3: Create case and death data
if((length(plotCaseStates) > 0) | (length(plotDeathStates) > 0)) {
dfPlot <- dfCtyData %>%
select(state, date, cases, deaths) %>%
filter(state %in% all_of(union(plotCaseStates, plotDeathStates))) %>%
group_by(state, date) %>%
summarize(across(c(cases, deaths), .fns=function(x) sum(x, na.rm=TRUE)), .groups="drop")
}
# Plot 2: Death states
if(length(plotDeathStates) > 0) {
p2 <- dfPlot %>%
filter(state %in% all_of(plotDeathStates)) %>%
ggplot(aes(x=date, y=deaths)) +
geom_line(aes(group=state)) +
facet_wrap(~state, scales="free_y") +
labs(title="Reported deaths by states with meaningfully non-ascending trend",
x=NULL,
y="Reported deaths"
)
print(p2)
}
# Plot 3: Case states
if(length(plotCaseStates) > 0) {
p3 <- dfPlot %>%
filter(state %in% all_of(plotCaseStates)) %>%
ggplot(aes(x=date, y=cases)) +
geom_line(aes(group=state)) +
facet_wrap(~state, scales="free_y") +
labs(title="Reported cases by states with meaningfully non-ascending trend",
x=NULL,
y="Reported cases"
)
print(p3)
}
}
plotDeltaFromMax(dfTest, plotStateRatio=TRUE)
plotDeltaFromMax(dfTest,
dfCtyData=cty_newdata_20220913$dfPerCapita,
plotDeathStates=c("IL", "TX", "MA", "NE"),
plotCaseStates=c("IL", "WY")
)